from bs4 import  BeautifulSoup
from PIL import Image
import re
import urllib.request
import urllib.parse
import  http.cookiejar
from time import sleep
# Log in to Douban (asking the user to solve the captcha when one is shown)
# so that the comment pages can be scraped through an authenticated opener.
headers = (
    "User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0"
)
login_url = 'https://accounts.douban.com/login'

# Build the cookie-aware opener BEFORE the first request: cookies set while
# loading the login page and the captcha image are then sent back with the
# login POST instead of being discarded.
cjar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
# Send the browser User-Agent with every request made through this opener.
opener.addheaders = [headers]
urllib.request.install_opener(opener)

with opener.open(login_url) as login_:
    login_data = login_.read().decode('utf-8')

# Check whether the login page contains a captcha id.
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
captcha = re.findall(r'\w{24}:en', login_data)
num = ''
# NOTE(review): the account credentials are hard-coded in source control;
# consider loading them from the environment or a config file instead.
formdata = {
    'source':'None',
    'redir':'https://www.douban.com',
    "form_email":"15227230337",
    "form_password":"aq918927",
    'login':'登录'
}
if captcha:
    # Download the captcha image, show it and let the user type the solution.
    captcha_url = 'https://www.douban.com/misc/captcha?id=' + captcha[0] + '&size=s'
    with opener.open(captcha_url) as captcha_response:
        captcha_image = captcha_response.read()
    filename = '1.jpg'
    with open(filename, 'wb') as f:
        f.write(captcha_image)
    with Image.open(filename) as im:
        im.show()
    num = input('请输入验证码：')
    formdata['captcha-id'] = captcha[0]
    formdata['captcha-solution'] = num

# Submit the login form; the session cookies end up in `cjar`.
from_data = urllib.parse.urlencode(formdata).encode(encoding='utf-8')
req = urllib.request.Request(login_url, data=from_data)
with opener.open(req) as login_response:
    data = login_response.read().decode('utf-8')
# Fetch the comment pages of movie 5350027 one by one through `opener` and
# append the text of every comment to comment.txt.
PAGE_COUNT = 20  # number of comment pages to request
PAGE_SIZE = 20   # offset step between pages; the URL requests limit=20
for n in range(PAGE_COUNT):
    print('第 {}. 次'.format(n))
    url = 'https://movie.douban.com/subject/5350027/comments?start=' + str(n * PAGE_SIZE) + '&limit=20&sort=new_score&status=P&percent_type='
    print(url)
    # Close the HTTP response deterministically once the body has been read.
    with opener.open(url) as response:
        data = response.read()
    soup = BeautifulSoup(data, 'lxml')
    # User nicknames (selected but not written to the output file)
    comment_name = soup.select('span[class="comment-info"] > a')
    # Ratings (selected but not written to the output file)
    comment_score = soup.select('span[class="rating"]')
    # Timestamps (selected but not written to the output file)
    comment_time = soup.select('span[class="comment-time "]')
    # Comment bodies
    comment_content = soup.select('div[class="comment"] > p')
    # Append-only access is enough; the file is never read back here.
    with open('comment.txt', 'a', encoding='utf-8') as f:
        f.writelines(i.get_text() for i in comment_content)
        # Separator written after each page's comments
        f.write('--------------------------------------------------------------')
    # Pause between requests, but do not wait after the final page.
    if n < PAGE_COUNT - 1:
        sleep(5)

